knitr::opts_chunk$set(echo = TRUE, warning = FALSE, message = FALSE, fig.align = 'center')
knitr::opts_knit$set(root.dir = "/mnt/raid5/Personal/tangchao/project/Nanopore/BarcodeDecomplex")
library(PorexploreR)
library(data.table)
library(rhdf5)
library(ggplot2)
library(parallel)
library(changepoint)
library(caret)
library(pbapply)
library(GenomicAlignments)
library(ggbeeswarm)
# Read all squiggles from a single FAST5 file (one "Squiggle" object per read).
readFast5 <- function(file, NT) {
  reads <- rhdf5::h5ls(file = file, recursive = FALSE)$name
  res <- parallel::mclapply(X = reads, FUN = function(read) {
    h5_signals <- tryCatch(rhdf5::h5read(file = file, name = paste0(read, "/Raw"), read.attributes = TRUE),
                           error = function(e) NA)
    if (!is.list(h5_signals)) return(NA)  # h5read() failed for this read
    signal_meta <- rhdf5::h5read(file, paste0(read, "/channel_id"), read.attributes = TRUE)
    range <- attr(signal_meta, "range")
    digitisation <- attr(signal_meta, "digitisation")
    scaling <- range / digitisation
    offset <- attr(signal_meta, "offset")
    sampling_rate <- attr(signal_meta, "sampling_rate")
    new("Squiggle",
        raw_signal = as.integer(h5_signals[[1]]),
        range = as.numeric(range),
        digitisation = as.integer(digitisation),
        offset = as.integer(offset),
        sampling_rate = as.integer(sampling_rate),
        scaling = as.numeric(scaling))
  }, mc.cores = NT)
  names(res) <- reads
  return(res)
}

# Median/MAD normalization of a raw signal.
normalize_signal <- function(sig) {
  med <- median(sig)
  mad <- median(abs(sig - med))
  (sig - med) / max(0.01, (mad * 1.4826))
}

# Extract the putative barcode segment (normalized raw signal up to the poly(A)
# boundary) from the first `length` samples of a read.
# Returns NULL if no usable barcode segment is found.
GetBarcode3 <- function(read, length = 20000, plot = TRUE) {
  raw_sig <- PorexploreR::signal(read)
  if (length(raw_sig) > length) raw_sig <- raw_sig[seq_len(length)]
  Mat <- data.table::data.table(x = seq_along(raw_sig), norm = normalize_signal(raw_sig))
  Mat[, dema := smoother::smth(norm, method = "dema", n = 20)]
  # first changepoint of the smoothed signal: start of the informative region
  cp2 <- Mat[!is.na(dema), suppressWarnings(changepoint::cpt.meanvar(dema, class = FALSE))[[1]]] +
    Mat[!is.na(dema), min(x)]
  # segment the remaining signal to locate the poly(A) plateau
  ansmean <- tryCatch(suppressWarnings(changepoint::cpt.meanvar(Mat[cp2:nrow(Mat), dema],
                                                                penalty = "Asymptotic",
                                                                pen.value = 1e-10,
                                                                method = "PELT")),
                      error = function(e) NA)
  if (!is(ansmean, "cpt")) return(NULL)  # changepoint detection failed
  whichBin <- which.max(diff(c(0, head(ansmean@cpts, 4))))
  polyA_Pos <- ifelse(whichBin == 1, cp2, ansmean@cpts[whichBin - 1] + cp2)
  polyA_Sig <- ansmean@param.est$mean[whichBin]
  if (polyA_Pos < 2500) {
    BCSs <- NULL
  } else {
    if (mean(Mat[round(polyA_Pos * 0.75):polyA_Pos, dema] < polyA_Sig) > 0.9) {
      BCSs <- Mat[1:polyA_Pos, norm]
      if (plot) {
        Mat[1:polyA_Pos, smth := smoother::smth(norm, method = "dema", n = round(polyA_Pos / 400), v = 1)]
        Mat[, plot(x, norm, type = "s")]
        Mat[, lines(x, smth, type = "s", col = 2)]
        abline(v = polyA_Pos, col = 3, lty = 2, lwd = 3)
      }
    } else {
      if (mean(Mat[round(cp2 * 0.75):cp2, dema] < polyA_Sig) > 0.9) {
        BCSs <- Mat[1:cp2, norm]
        if (plot) {
          Mat[1:polyA_Pos, smth := smoother::smth(norm, method = "dema", n = round(polyA_Pos / 400), v = 1)]
          Mat[, plot(x, norm, type = "s")]
          Mat[1:cp2, lines(x, smth, type = "s", col = 2)]
          abline(v = cp2, col = 3, lty = 2, lwd = 3)
        }
      } else {
        BCSs <- NULL
      }
    }
  }
  # discard segments that are too short to be informative
  if (!is.null(BCSs)) {
    if (length(BCSs) < 2100) {
      BCSs <- NULL
    }
  }
  return(BCSs)
}

# Compress a signal into per-segment summaries (mean or median) after detecting
# up to `ChangePoints` changepoints.
MyChangePoint <- function(sig, MinLength = 10, ChangePoints = 68, StateStat = "Mean") {
  if (is.null(StateStat) || length(StateStat) != 1 || !is.element(StateStat, c("Mean", "Median"))) {
    stop("StateStat must be one of Mean or Median")
  }
  cp0 <- suppressWarnings(changepoint::cpt.meanvar(data = sig,
                                                   Q = ChangePoints,
                                                   penalty = "Manual",
                                                   method = "BinSeg",
                                                   class = FALSE,
                                                   minseglen = MinLength,
                                                   param.estimates = FALSE,
                                                   pen.value = 0.0001)) - 0.5
  bins <- cut(seq_along(sig), c(0, cp0, length(sig)), include.lowest = TRUE, labels = FALSE)
  if (StateStat == "Mean") {
    bin_sig <- as.numeric(by(sig, bins, mean))
  } else {
    bin_sig <- as.numeric(by(sig, bins, median))
  }
  return(bin_sig)
}
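The helpers above are used without further comment below, so a quick illustration may help: normalize_signal() median/MAD-scales the raw current and MyChangePoint() compresses a signal into per-segment summaries. A minimal sketch on simulated data (the step levels, noise, and segment count are invented purely for illustration):

set.seed(1)
# piecewise-constant toy "squiggle" with added noise (illustrative values only)
toy_sig  <- rep(c(80, 120, 95, 110), each = 500) + rnorm(2000, sd = 5)
toy_norm <- normalize_signal(toy_sig)
# summarize the normalized toy signal into segment means
toy_bins <- MyChangePoint(sig = toy_norm, ChangePoints = 3, MinLength = 10, StateStat = "Mean")
toy_bins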
files <- list.files("/mnt/raid61/Personal_data/songjunwei/DRS_RTA/batch3_fq/DRS_multiplex3/Pb_9sample/20210918_2113_MN27836_FAQ60861_83a343d5",
                    pattern = "fast5$", recursive = TRUE, full.names = TRUE)
barcodeSigs <- lapply(files, function(file1) {
  print(file1)
  fast5_1 <- readFast5(file = file1, NT = 10)
  barcode <- mclapply(fast5_1, function(x) GetBarcode3(read = x, plot = FALSE), mc.cores = 10)
  names(barcode) <- names(fast5_1)
  barcode[!mapply(is.null, barcode)]
})
barcodeSigs <- do.call(c, barcodeSigs)
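To eyeball how the poly(A)/barcode boundary is being called on a single read, GetBarcode3() can be run with plot = TRUE. A minimal sketch, assuming files[1] exists and its first read loads cleanly (otherwise NA/NULL is returned):

# inspect one read interactively; the detected barcode segment boundary is drawn
fast5_example <- readFast5(file = files[1], NT = 1)
bc_example <- GetBarcode3(read = fast5_example[[1]], plot = TRUE)
length(bc_example)  # 0 if no barcode segment was detected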
barcodeSigsBin <- pblapply(barcodeSigs, FUN = function(x) {
  MyChangePoint(sig = x, ChangePoints = 98, MinLength = 10, StateStat = "Mean")
}, cl = 4)
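Because do.call(rbind, ...) recycles vectors of unequal length, it is worth confirming that every read yielded the same number of bins before building the feature matrix; a small check on the object already in memory:

# all binned signals should have the same length, otherwise rbind() would recycle
table(lengths(barcodeSigsBin))
stopifnot(length(unique(lengths(barcodeSigsBin))) == 1)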
barcodeSigsBinMat <- as.data.frame(do.call(rbind, barcodeSigsBin))
colnames(barcodeSigsBinMat) <- paste0("BIN", sprintf("%03d", seq_len(ncol(barcodeSigsBinMat))))
saveRDS(barcodeSigsBinMat, "./analysis/08.m6A/01.Plasmodium/20211008/01.BarcodeDecomplex/barcodeSigsBinMat.Rds")
barcodeSigsBinMat <- readRDS("./analysis/08.m6A/01.Plasmodium/20211008/01.BarcodeDecomplex/barcodeSigsBinMat.Rds")
load(file = "./analysis/07.VirusClassification/03.Merge/01.Classifiers/20211008_9.RData")
Fit1
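Fit1 is a caret classifier trained in a separate analysis and only loaded here (from 20211008_9.RData); its construction is not part of this document. For orientation, a model of this kind could be trained on the BIN features roughly as sketched below; train_mat, train_labels, the "rf" method and the cross-validation settings are placeholders and assumptions, not the author's actual setup:

# hypothetical training sketch only; train_mat / train_labels are placeholder objects
library(caret)
ctrl <- trainControl(method = "cv", number = 10, classProbs = TRUE)
Fit_sketch <- train(x = train_mat, y = train_labels, method = "rf", trControl = ctrl)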
ROC_Tab <- data.frame(predict(Fit1, barcodeSigsBinMat, type = "prob"),
                      pred = predict(Fit1, newdata = barcodeSigsBinMat))
saveRDS(ROC_Tab, "./analysis/08.m6A/01.Plasmodium/20211008/01.BarcodeDecomplex/BarcodesPrediction.Rds")
Preds <- readRDS("./analysis/08.m6A/01.Plasmodium/20211008/01.BarcodeDecomplex/BarcodesPrediction.Rds")
colnames(Preds) <- gsub("\\.", "-", colnames(Preds))
Preds <- as.data.table(Preds, keep.rownames = "qname")
Preds$PP <- apply(Preds[, grepl("RTA", colnames(Preds)), with = FALSE], 1, max)
Preds[, .N, pred]

ggplot(Preds, aes(x = pred, y = PP)) +
  geom_violin() +
  scale_y_continuous(limits = c(0, 1)) +
  theme_classic(base_size = 15)
Mat <- copy(Preds)
Cutoff_Select <- lapply(seq(0, 100, 5) / 100, function(i) {
  ClassifiedReads <- Mat[, sum(PP >= i)]
  ReadsPercent <- Mat[, mean(PP >= i) * 100]
  data.frame(ClassifiedReads = ClassifiedReads, ClassifiedReadsPercent = ReadsPercent)
})
Cutoff_Select <- as.data.table(do.call(rbind, Cutoff_Select))
Cutoff_Select$Cutoff <- seq(0, 100, 5) / 100
ggplot(Cutoff_Select, aes(Cutoff, ClassifiedReadsPercent)) +
  geom_line() +
  labs(x = "PP", y = "Percentage of successful reads") +
  theme_classic() +
  theme(legend.position = "none",
        axis.title = element_text(size = 16),
        axis.text = element_text(size = 12)) +
  geom_vline(xintercept = 0.3)
Preds[PP > 0.0, .N, pred][order(pred), ]
Preds[PP > 0.3, .N, pred][order(pred), ]
Preds[PP > 0.5, .N, pred][order(pred), ]
bams <- file.path("/mnt/raid61/Personal_data/songjunwei/DRS_RTA/align/batch3_9Pb/PbergheiANKA.bam")
aligns <- lapply(bams, function(bamFile) {
  bam <- GenomicAlignments::readGAlignments(file = bamFile,
                                            param = Rsamtools::ScanBamParam(what = c("qname", "flag", "mapq")))
  data.table(Species = gsub(".bam", "", basename(bamFile)),
             as.data.frame(mcols(bam)),
             Length = qwidth(bam))
})
aligns <- do.call(rbind, aligns)
aligns <- na.omit(aligns)
aligns[, qname := paste0("read_", qname)]
aligns <- aligns[!flag %in% c(2048, 2064)]  # drop supplementary alignments
aligns <- aligns[mapq >= 60]                # keep only high-confidence alignments
Preds <- Preds[qname %in% aligns[, qname]]
Preds[PP <= 0.3, pred := "Unclassified"]  # reads at or below the PP cutoff are left unclassified
library(ShortRead)

fastq <- fread("/mnt/raid61/Personal_data/songjunwei/DRS_RTA/batch3_fq/DRS_multi3_9Pb.fastq", sep = "\t", header = FALSE)
rid <- gsub("^@", "read_", mapply(function(x) x[1], strsplit(fastq[grepl("^@", V1)][[1]], " ")))
fa <- RNAStringSet(fastq[fastq[, grep("^@", V1)] + 1, ][[1]])
names(fa) <- rid

ReadLength <- data.table(qname = names(fa), Length = width(fa))
aligns <- merge(aligns, ReadLength, by = "qname")
qual <- PhredQuality(fastq[grep("^@", V1) + 3][[1]])
names(qual) <- gsub("^@", "", fastq[grep("^@", V1)][[1]])
names(fa) <- gsub("^@", "", fastq[grep("^@", V1)][[1]])
fastq <- QualityScaledDNAStringSet(x = fa, quality = qual)
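The fread()-based parsing above assumes that header lines, and only header lines, start with "@"; quality strings can also begin with "@", so the grep can misfire. An alternative way to obtain a QualityScaledDNAStringSet is ShortRead::readFastq, sketched here under the assumption that the FASTQ is T-encoded (readFastq builds a DNAStringSet and does not accept U bases) and fits in memory:

# alternative FASTQ import via ShortRead (assumes T-encoded sequences)
fq <- ShortRead::readFastq("/mnt/raid61/Personal_data/songjunwei/DRS_RTA/batch3_fq/DRS_multi3_9Pb.fastq")
fastq_alt <- QualityScaledDNAStringSet(x = sread(fq),
                                       quality = PhredQuality(quality(quality(fq))))
names(fastq_alt) <- as.character(id(fq))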
setdiff(Preds[, levels(pred)], "Unclassified")

for(i in setdiff(Preds[, levels(pred)], "Unclassified")) {
  fi <- fastq[paste0("read_", mapply(function(x) x[1], strsplit(names(fastq), " "))) %in% Preds[pred == i, qname]]
  writeQualityScaledXStringSet(fi, paste0("./analysis/08.m6A/01.Plasmodium/20211008/02.SplitFastq/", i, ".fastq"))
}